Set up

using <- function(...) {
    libs <- unlist(list(...))
    need <- libs <- libs[!unlist(lapply(libs, require, character.only = T))]
    if(length(need) > 0){ 
        install.packages(need, repos = "https://cloud.r-project.org")
        need <- need[!unlist(lapply(need, require, character.only = T))]
        if (length(need) > 0) {
          if (!requireNamespace("BiocManager", quietly = T))
            install.packages("BiocManager")
          BiocManager::install(need)
        }
    }
    lapply(libs, require, character.only = T)
}
using("data.table", "tidyverse", "smacof", "DESeq2", "InteractionSet", "diffHic",
      "broom", "highcharter", "heatmaply", "plotly", "eulerr")

Import

We’ll have to to read in some data, though it’s already been done and the output saved in lec12.RData.

load('lec12.RData')

This “Import” section doesn’t need to be ran in class, they’re just for your reference


HiCRep

While we have one correlation coefficient per chromosome, we can take the average (weighted by chromosome length) to obtain a representative value per pairwise comparison

# relative size of each chromosome
f <- fread('mm10.chrom.sizes')$V2 %>% {. / sum(.)}

# we take the weighted average of SCC across chromosomes
hr <- list.files('hicrep', full.names = T) %>% 
  setNames(., sub('.txt', '', basename(.))) %>%
  sapply(function(x) sum(fread(x, skip = 2)$V1 * f)) %>% 
  data.frame(c = .) %>%
  rownames_to_column('p') %>%
  separate(p, c('s1', 's2'), '\\.') %>% 
  pivot_wider(names_from = s2, values_from = 'c') %>%
  column_to_rownames('s1')
ESC_1 ESC_2 ESC_3 ESC_4 ESC NPC_1 NPC_2 NPC_3 NPC_4 NPC
ESC_1 1 0.983 0.953 0.966 0.985 0.801 0.804 0.826 0.825 0.821
ESC_2 0.983 1 0.985 0.988 0.997 0.856 0.858 0.879 0.878 0.875
ESC_3 0.953 0.985 1 0.99 0.99 0.891 0.898 0.91 0.912 0.91
ESC_4 0.966 0.988 0.99 1 0.992 0.884 0.889 0.899 0.901 0.9
ESC 0.985 0.997 0.99 0.992 1 0.861 0.866 0.883 0.884 0.88
NPC_1 0.801 0.856 0.891 0.884 0.861 1 0.978 0.973 0.971 0.983
NPC_2 0.804 0.858 0.898 0.889 0.866 0.978 1 0.99 0.99 0.996
NPC_3 0.826 0.879 0.91 0.899 0.883 0.973 0.99 1 0.996 0.997
NPC_4 0.825 0.878 0.912 0.901 0.884 0.971 0.99 0.996 1 0.997
NPC 0.821 0.875 0.91 0.9 0.88 0.983 0.996 0.997 0.997 1

P(s)

Instead of the complete output, we just take the logbin summarized files for visualization

ps <- list.files('exp/1000/cis', full.names = T) %>% 
  grep('der|log', ., value = T) %>% 
  split(., sub('.*\\.(.*)\\.tsv', '\\1', .)) %>%
  lapply(function(x) {
    split(x, sub('\\..*', '', basename(x))) %>%
      lapply(fread)
  })
head(ps$log$ESC)

Domains

ins <- list.files('ins/10000/100000', pattern = 'tsv', full.names = T)  %>%
  grep('ESC|NPC', ., value = T) %>%
  setNames(., sub('.tsv', '', basename(.))) %>%
  lapply(fread)
head(ins$ESC)

Dots

The outputs from cooltools call-dots are directly read in

dots <- list.files('dots', pattern = 'postproc', full.names = T) %>%
  setNames(., sub('\\..*', '', basename(.))) %>%
  lapply(fread)

RNA-seq

Omitted since the process of constructing a DESeqDataSet from Salmon outputs has been described before


Matrix similarity

There’s a number of ways through which we can examine the correlation matrix

Distribution

hr %>%
  {.[upper.tri(., diag = T)] <- NA; .} %>%
  rownames_to_column('s1') %>%
  pivot_longer(-s1, names_to = 's2', values_to = 'c') %>%
  na.omit() %>%
  mutate(c1 = sub('_.*', '', s1),
         c2 = sub('_.*', '', s2),
         kind = ifelse(c1 == c2, 'same', 'diff')) %>%
  plot_ly(x = ~kind, y = ~c, color = ~kind, type = 'box')

Hiearchical clustering

tibble(s = names(hr)) %>%
  mutate(type = factor(sub('_.*', '', s))) %>%
  column_to_rownames('s') %>%
  heatmaply(hr, row_side_colors = ., col_side_colors = .,
            label_names = c("Sample 1", "Sample 2", "SCC"))

MDS

sim2diss(hr, method = 'corr') %>%
  mds(ndim = 2) %>%
  .$conf %>%
  as.data.frame() %>%
  rownames_to_column('s') %>%
  mutate(type = sub('_.*', '', s)) %>%
  plot_ly(x = ~D1, y = ~D2, color = ~type, text = ~s) %>%
  add_markers(legendgroup = ~type) %>%
  add_text(textposition = 'top left', showlegend = F, legendgroup = ~type)


P(s)

For the contact probability decay curves and their derivatives that we’ve computed before, we can again just simply visualize them

# colors for each cell type
hclrs <- c(ESC = "#7cb5ec", NPC = "#434348")

# plot data
pd <- ps %>%
  lapply(function(x) {
    d <- rbindlist(x, idcol = 'samp')
    ifelse('slope' %in% names(d), 'slope', 'balanced.avg') %>%
      c('samp', 's_bp', .) %>%
      d[, ., with = F] %>%
      `colnames<-`(c('samp', 'x', 'y'))
  }) %>%
  rbindlist(idcol = 'kind') %>%
  group_by(kind, samp) %>%
  do(data = list_parse2(data.frame(.$x, .$y))) %>%
  ungroup() %>%
  separate(samp, c('ctype', 'rep'), '_', F, fill = 'right') %>%
  #dplyr::filter(is.na(rep)) %>%
  mutate(color = hclrs[ctype],
         ctype = factor(ctype, names(hclrs))) %>%
  arrange(ctype) %>%
  mutate(name = samp,
         id = ifelse(kind != 'log', tolower(name), NA),
         linkedTo = ifelse(kind == 'log', tolower(name), NA),
         yAxis = ifelse(kind == 'log', 0, 2))

highchart() %>%
  # we use log scale for P(s) and linear for the slope (which was already taken in log space)
  hc_yAxis_multiples(list(title = list(text = "<b>Contact probability</b>, P<sub>c</sub>(s)",
                                       useHTML = TRUE),
                          type = "logarithmic",
                          labels = list(formatter = JS("function(){return this.value.toExponential(0);}")),
                          height = '45%', top = '0%', offset = 0),
                     list(height = '10%', top = '45%',
                          title = NULL,
                          plotLines = list(
                            list(color = "#a9a9a9", width = 2,
                                 value = .5, zIndex = 1)
                          ),
                          labels = list(enabled = F),
                          gridLineWidth = 0,
                          min = 0, max = 1),
                     list(type = "linear",
                          title = list(
                            text = "<b>Slope</b>, <sup>d</sup>&frasl;<sub>ds</sub> log P<sub>c</sub>(s)",
                            useHTML = TRUE
                          ),
                          height = '45%', top = '55%', offset = 0)) %>%
  # grab the data
  hc_add_series_list(pd) %>%
  # a bit of formatting
  hc_xAxis(type = "logarithmic",
           title = list(text = "<b>Genomic separation</b> (bp), s",
                        useHTML = T),
           minorTickInterval = 'auto',
           min = 1e4, max = 1e8) %>%
  hc_tooltip(headerFormat = '<span style="font-size: 10px">{point.key:,.0f} bp</span><br/>') %>%
  hc_chart(zoomType = "xy") %>%
  hc_plotOptions(line = list(marker = list(enabled = F)))

Compartments

Next we’ll check out the first eigenvector tracks (i.e., “compartment score”)

Similarity

Scatter

d <- lapply(dat, `[[`, 'E1') %>% bind_cols() %>% na.omit()
x <- d$ESC
y <- d$NPC
subplot(
  plot_ly(x = x, color = I(hclrs['ESC']), type = 'histogram'), 
  plotly_empty(), 
  plot_ly(x = x, y = y, type = 'histogram2dcontour', showscale = F,
          contours = list(showlines = F), ncontours = 30),
  plot_ly(y = y, color = I(hclrs['NPC']), type = 'histogram'),
  nrows = 2, heights = c(0.2, 0.8), widths = c(0.8, 0.2), 
  shareX = T, shareY = T, titleX = F, titleY = F
) %>% layout(showlegend = FALSE, xaxis = list(title = 'ESC'), yaxis2 = list(title = 'NPC'))

Overlap

# edges
lnks <- d %>%
  mutate(across(everything(), function(x) ifelse(x > 0, 'A', 'B'))) %>%
  `colnames<-`(c('from', 'to')) %>%
  dplyr::count(from, to, name = 'weight') %>%
  mutate(from = sprintf('%s.%s', 'ESC', from),
         to = sprintf('%s.%s', 'NPC', to),
         weight = 100 * weight / sum(weight)) %>%
  transpose() 

# order of cell types
odr <- names(hclrs)

# nodes
nds <- lapply(lnks, function(x) c(x$from, x$to)) %>%
  unlist() %>%
  unique() %>%
  lapply(function(x) {
    tibble(id = x) %>%
      separate(id, c('samp', 'name'), '\\.', F) %>%
      mutate(column = which(odr == samp) - 1,
             color = ifelse(name == 'A', '#6AA56E', '#B65655')) %>%
      as.list()
  })

# show as sankey
highchart() %>%
  hc_chart(type = 'sankey') %>%
  hc_add_series(data = lnks, nodes = nds) %>%
  hc_xAxis(tickPositions = c(-0.49, 3.5),
           startOnTicks = F,
           endOnTicks = F,
           lineColor = 'transparent',
           labels = list(formatter = JS("function() {
               var d;
               if (this.value < 0) {
                   d = 'ES';
               } else if (this.value > 3) {
                   d = 'NPC';
               }
               return d;
           }"), align = 'center'),
           tickLength = 0) %>%
  hc_chart(showAxes = T) %>%
  hc_tooltip(nodeFormatter = JS("function() {
                 return '<span style=\"font-size: 10px\">' + this.options.samp +
                     '</span><br/>' + this.options.name + ': ' + 
                     Highcharts.numberFormat(this.sum, 1) + '%';
             }"),
             pointFormatter = JS("function() {
                 return '<span style=\"font-size: 10px\">' +
                     this.fromNode.options.samp + '&#8594;' +
                     this.toNode.options.samp + '</span><br/>' +
                     this.fromNode.options.name + '&#8594;' +
                     this.toNode.options.name + ': ' +
                     Highcharts.numberFormat(this.weight, 1) + '%';
             }"),
             headerFormat = '',
             useHTML = T)


Epigenetic correlation

dat$ESC %>% 
  bind_cols() %>% 
  na.omit() %>%
  mutate(across(-E1, log10)) %>% 
  mutate(across(-E1, function(x) x - Input)) %>% 
  dplyr::filter(is.finite(rowSums(.))) %>% 
  dplyr::select(-Input) %>% 
  cor() %>%
  heatmaply_cor()

TADs

For the simplest comparison we can just count the number of shared boundaries

bdrs <- ins[c('ESC', 'NPC')] %>%
  lapply(function(x) {
    na.omit(x) %>%
      dplyr::filter(boundary_strength_100000 > .5) %>%
      mutate(start = start + 1) %>%
      makeGRangesFromDataFrame()
  })

Reduce(c, bdrs) %>% 
  unique() %>%
  {lapply(bdrs, function(x) overlapsAny(., x))} %>%
  bind_cols() %>%
  euler() %>%
  plot(quantities = T)


Loops

There are specific classes in R that handles paired ranges, one of which is GInteractions

lps <- dots[c('ESC', 'NPC')] %>%
  lapply(function(x) {
    list(x[,1:3], x[,4:6]) %>%
      lapply(function(y) {
        y %>%
          `colnames<-`(c('chr', 'start', 'end')) %>%
          mutate(start = start + 1) %>%
          makeGRangesFromDataFrame()
      }) %>%
      {GInteractions(.[[1]], .[[2]], mode = 'reverse')}
  })
lps$ESC
## ReverseStrictGInteractions object with 3738 interactions and 0 metadata columns:
##          seqnames1             ranges1     seqnames2             ranges2
##              <Rle>           <IRanges>         <Rle>           <IRanges>
##      [1]      chr1     4750001-4775000 ---      chr1     4500001-4525000
##      [2]      chr1     5000001-5025000 ---      chr1     4475001-4500000
##      [3]      chr1     5900001-5925000 ---      chr1     4475001-4500000
##      [4]      chr1     5900001-5925000 ---      chr1     4750001-4775000
##      [5]      chr1     5900001-5925000 ---      chr1     4900001-4925000
##      ...       ...                 ... ...       ...                 ...
##   [3734]      chrX 167800001-167825000 ---      chrX 167300001-167325000
##   [3735]      chrX 167800001-167825000 ---      chrX 167475001-167500000
##   [3736]      chrX 168925001-168950000 ---      chrX 168400001-168425000
##   [3737]      chrX 168925001-168950000 ---      chrX 168700001-168725000
##   [3738]      chrX 169275001-169300000 ---      chrX 168925001-168950000
##   -------
##   regions: 5537 ranges and 0 metadata columns
##   seqinfo: 20 sequences from an unspecified genome; no seqlengths
Reduce(c, lps) %>% 
  unique() %>%
  InteractionSet(matrix(1, length(.)), .) %>%
  clusterPairs(tol = 1, upper = 1e6) %>%
  .$interactions %>%
  {lapply(lps, function(x) overlapsAny(., x))} %>%
  bind_cols() %>%
  euler() %>%
  plot(quantities = T)


Differential interaction

As before, the results of differential interaction analysis can be visualized using the same types of figures we’ve used previously

res <- di %>%
  lapply(function(x) {
    results(x, contrast = c('condition', 'NPC', 'ESC')) %>%
      as.data.frame() %>%
      rownames_to_column('crd')
  }) %>%
  bind_rows(.id = 'chr') %>%
  separate(crd, c('i', 'j'), ':') %>%
  mutate(info = sprintf('%s:%s-%s<br>baseMean: %.3g<br>log2FoldChange: %.3g<br>FDR: %.3g',
                        chr, i, j, baseMean, log2FoldChange, padj)) 
plot_ly(res, x = ~baseMean, y = ~log2FoldChange, color = ~-log10(padj),
        type = 'scatter', mode = 'markers', customdata = ~info,
        hovertemplate = '%{customdata}<extra></extra>') %>%
  layout(xaxis = list(type = 'log'))

One simple integrative analysis is to assess the impact of differential long-range interaction in promoter regions on expression of the cognate gene.

# differential expressed genes
deg <- results(dds, contrast = c('condition', 'NPC', 'ESC')) %>%
  as.data.frame() %>%
  rownames_to_column('ID') %>%
  na.omit() %>%
  dplyr::filter(padj < .01 & abs(log2FoldChange) > 2) %>%
  mutate(dir = ifelse(log2FoldChange > 0, 'up', 'dn')) %>%
  dplyr::filter(ID %in% genes$ID[genes$gene_type == 'protein_coding']) %>%
  {split(.$ID, .$dir)}

# genes with differential promoter long-range interaction
dig <- res %>%
  na.omit() %>%
  dplyr::filter(padj < .01 & abs(log2FoldChange) > .5) %>%
  mutate(dir = ifelse(log2FoldChange > 0, 'up', 'dn')) %>%
  split(., .$dir) %>%
  lapply(function(x) {
    list(x[,c(1,2)], x[,c(1,3)]) %>%
      lapply(function(y) {
        y[[3]] <- as.numeric(y[[2]]) + 2.5e4
        colnames(y) <- c('chr', 'start', 'end')
        y
      }) %>%
      bind_rows() %>%
      mutate(start = as.numeric(start) + 1) %>%
      makeGRangesFromDataFrame() %>%
      {genes[overlapsAny(promoters(genes), .)]} %>%
      {.[.$gene_type == 'protein_coding']} %>%
      .$ID %>%
      unique()
  }) %>%
  {lapply(., setdiff, intersect(.[[1]], .[[2]]))}

# total number of quantifiable genes
tot <- results(dds) %>%
  na.omit() %>%
  rownames() %>%
  intersect(., genes$ID[genes$gene_type == 'protein_coding']) %>%
  length()

# assess overlap
lapply(deg, function(x) {
  lapply(dig, function(y) {
    m <- tibble(a = length(intersect(x, y)),
                b = length(x) - a,
                c = length(y) - a,
                d = tot - a - b - c)
    m %>%
      as.numeric() %>%
      matrix(2, 2) %>%
      fisher.test() %>%
      tidy() %>%
      cbind(m)
  }) %>% bind_rows(.id = 'di')
}) %>% bind_rows(.id = 'de') %>%
  mutate(grp = sprintf('Expression: %s<br>Interaction: %s', de, di),
         info = sprintf('Odds ratio: %.3g (%.3g - %.3g)<br>P-value: %.3g<br>Overlap: %d',
                        estimate, conf.low, conf.high, p.value, a),
         clo = estimate - conf.low,
         chi = conf.high - estimate) %>%
  plot_ly(x = ~grp, y = ~estimate, type = 'scatter', mode = 'markers', color = ~grp,
          error_y = ~list(array = chi, arrayminus = clo, color = grp), showlegend = F,
          customdata = ~info, hovertemplate = '%{customdata}<extra></extra>') %>%
  layout(xaxis = list(title = NA),
         yaxis = list(title = 'Odds ratio'),
         shapes = list(type = 'line', line = list(color = 'black'), xref = 'paper', yref = 'y',
                       x0 = 0, x1 = 1, y0 = 1, y1 = 1, layer = 'below'))